/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.io;
import java.io.*;
import java.util.*;
import java.util.logging.*;
import org.apache.lucene.util.PriorityQueue;
import net.nutch.util.*;
/** Support for flat files of binary key/value pairs. */
public class SequenceFile {
/** Logger for this class, named <code>net.nutch.io.SequenceFile</code>. */
public static final Logger LOG =
  LogFormatter.getLogger("net.nutch.io.SequenceFile");

private SequenceFile() {}                         // no public ctor

/** Bytes written at the start of every file: 'S', 'E', 'Q' followed by the
 * version number.  Declared final so the reference can never be re-pointed
 * at a different array. */
private static final byte[] VERSION = new byte[] {
  (byte)'S', (byte)'E', (byte)'Q', 1
};
/** Write key/value pairs to a sequence-format file. */
public static class Writer {
  private BufferedRandomAccessFile out;
  // scratch space used to serialize one key/value pair before it is written
  private DataOutputBuffer buffer = new DataOutputBuffer();
  private Class keyClass;
  private Class valClass;

  /** Create the named file.
   * @throws IOException if the file already exists or its header cannot be
   * written.  In the latter case the file is closed again before the
   * exception propagates. */
  public Writer(String name, Class keyClass, Class valClass)
    throws IOException {
    File file = new File(name);
    if (file.exists())
      throw new IOException("already exists: " + file);
    BufferedRandomAccessFile raf = new BufferedRandomAccessFile(name, false);
    boolean initialized = false;
    try {
      init(raf, keyClass, valClass);
      initialized = true;
    } finally {
      if (!initialized) {                         // don't leak the open file
        try {
          raf.close();
        } catch (IOException ignored) {
          // the failure raised by init() is the one worth reporting
        }
      }
    }
  }

  /** Write to a file that is already open.  A fresh header is written at
   * the file's current position. */
  private Writer(BufferedRandomAccessFile out,
                 Class keyClass, Class valClass) throws IOException {
    init(out, keyClass, valClass);
  }

  /** Records the output and classes, then writes the header: the version
   * bytes followed by the key and value class names. */
  private void init(BufferedRandomAccessFile out,
                    Class keyClass, Class valClass) throws IOException {
    this.out = out;
    this.out.write(VERSION);
    this.keyClass = keyClass;
    this.valClass = valClass;
    new UTF8(WritableName.getName(keyClass)).write(this.out);
    new UTF8(WritableName.getName(valClass)).write(this.out);
  }

  /** Returns the class of keys in this file. */
  public Class getKeyClass() { return keyClass; }

  /** Returns the class of values in this file. */
  public Class getValueClass() { return valClass; }

  /** Close the file.  Calling this more than once is harmless. */
  public void close() throws IOException {
    if (out != null) {
      out.close();
      out = null;
    }
  }

  /** Append a key/value pair.
   * @throws IOException if either object is not exactly of the class this
   * file was created with, or if the key serializes to zero bytes. */
  public void append(Writable key, Writable val) throws IOException {
    if (key.getClass() != keyClass)
      throw new IOException("wrong key class: "+key+" is not "+keyClass);
    if (val.getClass() != valClass)
      throw new IOException("wrong value class: "+val+" is not "+valClass);

    buffer.reset();

    key.write(buffer);
    int keyLength = buffer.getLength();
    if (keyLength == 0)
      throw new IOException("zero length keys not allowed: " + key);

    val.write(buffer);

    append(buffer.getData(), 0, buffer.getLength(), keyLength);
  }

  /** Append an already-serialized key/value pair.
   * @param data array holding the key bytes immediately followed by the
   * value bytes
   * @param start offset of the first key byte in <code>data</code>
   * @param length total number of key plus value bytes
   * @param keyLength number of leading bytes that belong to the key
   * @throws IOException if <code>keyLength</code> is zero, negative, or
   * larger than <code>length</code> */
  public void append(byte[] data, int start, int length, int keyLength)
    throws IOException {
    if (keyLength == 0)
      throw new IOException("zero length keys not allowed");
    // Reject impossible key lengths before anything reaches the file, so a
    // caller error cannot leave an unreadable record behind.
    if (keyLength < 0 || keyLength > length)
      throw new IOException("invalid key length: " + keyLength
                            + " for record of length " + length);

    out.writeInt(length);                         // total record length
    out.writeInt(keyLength);                      // key portion length
    out.write(data, start, length);               // data
  }

  /** Returns the current length of the output file, including bytes that
   * are still buffered.  Must not be called after {@link #close()}. */
  public long getLength() {
    return out.getFilePointer();
  }
}
/** Reads key/value pairs from a sequence-format file. */
public static class Reader {
  private String file;
  private BufferedRandomAccessFile in;
  // outBuf receives the raw bytes of one record; inBuf is then pointed at
  // those bytes so that the key and value can deserialize themselves.
  private DataOutputBuffer outBuf = new DataOutputBuffer();
  private DataInputBuffer inBuf = new DataInputBuffer();

  private Class keyClass;
  private Class valClass;

  private long end;                 // file offset at which reading stops
  private int keyLength;            // key length of the last record read

  /** Open the named file. */
  public Reader(String file) throws IOException {
    this(file, 4096);
  }

  /** Open the named file with the given buffer size and read it up to its
   * current length.  The file is closed again if its header is rejected. */
  private Reader(String file, int bufferSize) throws IOException {
    this.file = file;
    this.in = new BufferedRandomAccessFile(file, bufferSize, true);
    boolean opened = false;
    try {
      this.end = new File(file).length();
      init();
      opened = true;
    } finally {
      if (!opened)
        closeQuietly();                           // don't leak the open file
    }
  }

  /** Open a section of the named file.  A header is expected at offset
   * <code>start</code>; reading stops <code>length</code> bytes after the
   * end of that header.  The file is closed again if the header is
   * rejected. */
  private Reader(String file, int bufferSize, long start, long length)
    throws IOException {
    this.file = file;
    this.in = new BufferedRandomAccessFile(file, bufferSize, true);
    boolean opened = false;
    try {
      seek(start);
      init();
      this.end = in.getFilePointer() + length;
      opened = true;
    } finally {
      if (!opened)
        closeQuietly();                           // don't leak the open file
    }
  }

  /** Reads and checks the version bytes, then resolves the key and value
   * classes named in the header. */
  private void init() throws IOException {
    byte[] version = new byte[VERSION.length];
    in.readFully(version);
    if (!Arrays.equals(version, VERSION))
      throw new VersionMismatchException(VERSION[3], version[3]);

    UTF8 className = new UTF8();

    className.readFields(in);                     // read key class name
    this.keyClass = WritableName.getClass(className.toString());

    className.readFields(in);                     // read val class name
    this.valClass = WritableName.getClass(className.toString());
  }

  /** Closes the input after a failed open, suppressing any secondary
   * error so that the original failure is the one reported. */
  private void closeQuietly() {
    try {
      in.close();
    } catch (IOException ignored) {
      // the exception already in flight is the one worth reporting
    }
  }

  /** Close the file. */
  public synchronized void close() throws IOException {
    in.close();
  }

  /** Returns the class of keys in this file. */
  public Class getKeyClass() { return keyClass; }

  /** Returns the class of values in this file. */
  public Class getValueClass() { return valClass; }

  /** Read the next key in the file into <code>key</code>, skipping its
   * value.  True if another entry exists, and false at end of file.
   * @throws IOException if <code>key</code> is of the wrong class or does
   * not consume exactly the number of bytes recorded for the key */
  public synchronized boolean next(Writable key) throws IOException {
    if (key.getClass() != keyClass)
      throw new IOException("wrong key class: "+key+" is not "+keyClass);

    outBuf.reset();

    keyLength = next(outBuf);
    if (keyLength < 0)
      return false;

    inBuf.reset(outBuf.getData(), outBuf.getLength());

    key.readFields(inBuf);
    if (inBuf.getPosition() != keyLength)
      throw new IOException(key + " read " + inBuf.getPosition()
                            + " bytes, should read " + keyLength);

    return true;
  }

  /** Read the next key/value pair in the file into <code>key</code> and
   * <code>val</code>.  Returns true if such a pair exists and false when at
   * end of file.
   * @throws IOException if either object is of the wrong class or does not
   * consume exactly the number of bytes recorded for it */
  public synchronized boolean next(Writable key, Writable val)
    throws IOException {
    if (val.getClass() != valClass)
      throw new IOException("wrong value class: "+val+" is not "+valClass);

    boolean more = next(key);

    if (more) {
      // inBuf is positioned just past the key, i.e. at the value
      val.readFields(inBuf);
      if (inBuf.getPosition() != outBuf.getLength())
        throw new IOException(val+" read "+(inBuf.getPosition()-keyLength)
                              + " bytes, should read " +
                              (outBuf.getLength()-keyLength));
    }

    return more;
  }

  /** Read the next key/value pair in the file into <code>buffer</code>.
   * Returns the length of the key read, or -1 if at end of file.  The length
   * of the value may be computed by calling buffer.getLength() before and
   * after calls to this method. */
  public synchronized int next(DataOutputBuffer buffer) throws IOException {
    if (in.getFilePointer() >= end)
      return -1;
    int length = in.readInt();                    // total record length
    int keyLength = in.readInt();                 // key portion length
    buffer.write(in, length);
    return keyLength;
  }

  /** Set the current byte position in the input file. */
  public synchronized void seek(long position) throws IOException {
    in.seek(position);
  }

  /** Return the current byte position in the input file. */
  public synchronized long getPosition() {
    return in.getFilePointer();
  }

  /** Returns the name of the file. */
  public String toString() {
    return file;
  }
}
/** Adds buffering to {@link RandomAccessFile}, which is not an InputStream
 * or an OutputStream, so BufferedInputStream and BufferedOutputStream
 * cannot be used. */
private static class BufferedRandomAccessFile extends RandomAccessFile {
  private byte[] buf;
  private int pos;          // reading: index of the next unread byte in buf
  private int count;        // reading: valid bytes in buf; writing: pending bytes
  private long filePointer; // offset of the underlying file, ignoring buf
  private boolean isReadOnly;                   // if false, then writeOnly

  /** Open the named file with a 4096 byte buffer. */
  public BufferedRandomAccessFile(String file, boolean isReadOnly)
    throws IOException {
    this(file, 4096, isReadOnly);
  }

  /** Open the named file for reading only or for writing only, using a
   * buffer of <code>bufLen</code> bytes. */
  public BufferedRandomAccessFile(String file, int bufLen,
                                  boolean isReadOnly) throws IOException {
    super(file, isReadOnly ? "r" : "rw");
    this.buf = new byte[bufLen];
    this.isReadOnly = isReadOnly;
  }

  /** Override unbuffered implementation. */
  public int read() throws IOException {
    if (pos >= count) {
      fill();
      if (pos >= count)
        return -1;
    }
    return buf[pos++] & 0xff;
  }

  /** Discards the buffer contents and refills the buffer from the file. */
  private void fill() throws IOException {
    if (!isReadOnly) throw new IOException("can't read write-only file");
    pos = 0;
    count = pos;
    int n = super.read(buf, 0, buf.length);
    if (n > 0) {
      count = n;                                  // update count
      filePointer += n;                           // update pointer
    }
  }

  /** Override unbuffered implementation. */
  public int read(byte[] b, int off, int len) throws IOException {
    int avail = count - pos;
    if (avail <= 0) {
      // a request at least as large as the buffer bypasses it entirely
      if (len >= buf.length) {
        int n = super.read(b, off, len);
        if (n > 0)
          filePointer += n;                       // update pointer
        return n;
      }
      fill();
      avail = count - pos;
      if (avail <= 0) return -1;
    }
    int cnt = (avail < len) ? avail : len;
    System.arraycopy(buf, pos, b, off, cnt);
    pos += cnt;
    return cnt;
  }

  /** Override unbuffered implementation. */
  public int read(byte b[]) throws IOException {
    return read(b, 0, b.length);
  }

  /** Rejects writes to a file opened for reading.  Without this check the
   * bytes would land in the buffer that holds read-ahead data. */
  private void checkWritable() throws IOException {
    if (isReadOnly) throw new IOException("can't write read-only file");
  }

  /** Override unbuffered implementation. */
  public void write(int b) throws IOException {
    checkWritable();
    if (count >= buf.length) {
      flushBuffer();
    }
    buf[count++] = (byte)b;
  }

  /** Override unbuffered implementation. */
  public void write(byte b[], int off, int len) throws IOException {
    checkWritable();
    // a request at least as large as the buffer is written straight through
    if (len >= buf.length) {
      flushBuffer();
      super.write(b, off, len);
      filePointer += len;
      return;
    }
    if (len > buf.length - count) {
      flushBuffer();
    }
    System.arraycopy(b, off, buf, count, len);
    count += len;
  }

  /** Override unbuffered implementation. */
  public void write(byte b[]) throws IOException {
    write(b, 0, b.length);
  }

  /** Writes any pending bytes to the file and empties the buffer. */
  private void flushBuffer() throws IOException {
    if (count > 0) {
      super.write(buf, 0, count);
      filePointer += count;
      count = 0;
    }
  }

  /** Override unbuffered implementation. */
  public void seek(long desired) throws IOException {
    if (!isReadOnly) throw new IOException("can't seek write-only file");
    long current = getFilePointer();
    long start = (current - pos);                 // file offset of buf[0]
    if (desired >= start && desired < start + count) {
      // can position within buffer
      pos += (desired - current);
    } else {
      count = 0;                                  // invalidate buffer
      pos = 0;

      super.seek(desired);                        // seek underlying stream
      filePointer = desired;                      // update pointer
    }
  }

  /** Override unbuffered implementation.  When reading, bytes buffered but
   * not yet consumed are subtracted; when writing, bytes buffered but not
   * yet flushed are added. */
  public long getFilePointer() {
    if (isReadOnly)
      return filePointer - (count - pos);
    else
      return filePointer + count;
  }

  /** Override unbuffered implementation.  The underlying file is closed
   * even when flushing the last buffered bytes fails. */
  public void close() throws IOException {
    try {
      if (!isReadOnly)
        flushBuffer();
    } finally {
      super.close();
    }
  }
}
/** Sorts key/value pairs in a sequence-format file.
*
* <p>For best performance, applications should make sure that the {@link
* Writable#readFields(DataInput)} implementation of their keys is
* very efficient. In particular, it should avoid allocating memory.
*/
public static class Sorter {
/** Default number of streams merged per pass, from "io.sort.factor". */
private static final int FACTOR = NutchConf.getInt("io.sort.factor", 100);
/** Default amount of buffer memory in megabytes, from "io.sort.mb". */
private static final int MEGABYTES = NutchConf.getInt("io.sort.mb", 100);

private WritableComparator comparator;

private String inFile;                        // when sorting
private String[] inFiles;                     // when merging

private String outFile;

// Computed in long arithmetic and clamped: a plain int product wraps to a
// negative or tiny value once the setting reaches 2048 megabytes.
private int memory =
  (int)Math.min((long)MEGABYTES * 1024 * 1024, Integer.MAX_VALUE); // bytes
private int factor = FACTOR;                  // merged per pass

private Class keyClass;
private Class valClass;
/** Sort and merge files containing the named classes. */
public Sorter(Class keyClass, Class valClass) {
  this(new WritableComparator(keyClass), valClass);
}

/** Sort and merge, ordering keys with the supplied
 * {@link WritableComparator}.  The key class is taken from the
 * comparator. */
public Sorter(WritableComparator comparator, Class valClass) {
  this.valClass = valClass;
  this.keyClass = comparator.getKeyClass();
  this.comparator = comparator;
}

/** Set how many streams are merged at once. */
public void setFactor(int factor) {
  this.factor = factor;
}

/** Returns how many streams are merged at once. */
public int getFactor() {
  return this.factor;
}

/** Set the total amount of buffer memory, in bytes. */
public void setMemory(int memory) {
  this.memory = memory;
}

/** Returns the total amount of buffer memory, in bytes. */
public int getMemory() {
  return this.memory;
}
/** Perform a file sort.*/
public void sort(String inFile, String outFile) throws IOException {
  this.inFile = inFile;
  this.outFile = outFile;

  File target = new File(outFile);
  if (target.exists()) {
    throw new IOException("already exists: " + target);
  }

  // The sort pass leaves one or more sorted segments behind; keep merging
  // until a single segment remains.  A pass is the final one when all the
  // remaining segments fit into one merge.
  int remaining = sortPass();
  for (int pass = 1; remaining > 1; pass++) {
    boolean isFinal = remaining <= factor;
    remaining = mergePass(pass, isFinal);
  }
}
/** Runs one {@link SortPass} over the input file and returns the number of
 * segments it wrote.  The pass is closed whether or not it succeeds. */
private int sortPass() throws IOException {
  LOG.fine("running sort pass");
  SortPass pass = new SortPass();
  int written;
  try {
    written = pass.run();
  } finally {
    pass.close();
  }
  return written;
}
/** Reads the input in memory-sized chunks, sorts each chunk by key and
 * writes it out as one segment. */
private class SortPass {
  private int limit = memory/4;     // buffered bytes that trigger a flush

  private DataOutputBuffer buffer = new DataOutputBuffer();
  private byte[] rawBuffer;         // backing array of buffer while sorting

  // Per-record tables, indexed by the order in which records were read.
  private int[] starts = new int[1024];         // offset of record in buffer
  private int[] pointers = new int[starts.length];  // record numbers, sorted
  private int[] pointersCopy = new int[starts.length]; // merge sort scratch
  private int[] keyLengths = new int[starts.length];
  private int[] lengths = new int[starts.length];

  private Reader in;
  private BufferedRandomAccessFile out;

  public SortPass() throws IOException {
    in = new Reader(inFile);
  }

  /** Sorts the whole input and returns the number of segments written.
   * When the entire input fits into the first chunk the result goes
   * directly to the output file; otherwise every segment is appended to
   * the intermediate file <code>outFile+".0"</code>. */
  public int run() throws IOException {
    int segments = 0;
    boolean atEof = false;
    while (!atEof) {
      int count = 0;
      buffer.reset();
      while (!atEof && buffer.getLength() < limit) {

        int start = buffer.getLength();         // read an entry into buffer
        int keyLength = in.next(buffer);
        int length = buffer.getLength() - start;

        if (keyLength == -1) {
          atEof = true;
          break;
        }

        if (count == starts.length)
          grow();

        starts[count] = start;                  // update pointers
        pointers[count] = count;
        lengths[count] = length;
        keyLengths[count] = keyLength;

        count++;
      }

      // buffer is full -- sort & flush it
      LOG.finer("flushing segment " + segments);
      rawBuffer = buffer.getData();
      sort(count);
      flush(count, segments==0 && atEof);
      segments++;
    }
    return segments;
  }

  /** Closes the input and, if one was opened, the output.  The output is
   * closed even when closing the input fails. */
  public void close() throws IOException {
    try {
      in.close();
    } finally {
      if (out != null)
        out.close();
    }
  }

  /** Enlarges all per-record tables by half. */
  private void grow() {
    int newLength = starts.length * 3 / 2;
    starts = grow(starts, newLength);
    pointers = grow(pointers, newLength);
    pointersCopy = new int[newLength];          // scratch, nothing to keep
    keyLengths = grow(keyLengths, newLength);
    lengths = grow(lengths, newLength);
  }

  private int[] grow(int[] old, int newLength) {
    int[] result = new int[newLength];
    System.arraycopy(old, 0, result, 0, old.length);
    return result;
  }

  /** Writes the first <code>count</code> buffered records in sorted order
   * as one segment.
   * @param done true when this segment is the complete result, in which
   * case it is written to the final output file without a size prefix */
  private void flush(int count, boolean done) throws IOException {
    if (out == null) {
      String outName = done ? outFile : outFile+".0";
      out = new BufferedRandomAccessFile(outName, false);
    }

    if (!done) {                                // an intermediate file
      // record bytes plus the two int length fields written per record;
      // widened before the arithmetic so the sum cannot wrap
      long length = (long)buffer.getLength() + (long)count * 8;
      out.writeLong(length);                    // write size
    }

    Writer writer = new Writer(out, keyClass, valClass);

    for (int i = 0; i < count; i++) {           // write in sorted order
      int p = pointers[i];
      writer.append(rawBuffer, starts[p], lengths[p], keyLengths[p]);
    }
  }

  /** Sorts the first <code>count</code> entries of <code>pointers</code>
   * by the keys of the records they refer to. */
  private void sort(int count) {
    System.arraycopy(pointers, 0, pointersCopy, 0, count);
    mergeSort(pointersCopy, pointers, 0, count);
  }

  /** Compares the keys of records <code>i</code> and <code>j</code>. */
  private int compare(int i, int j) {
    return comparator.compare(rawBuffer, starts[i], keyLengths[i],
                              rawBuffer, starts[j], keyLengths[j]);
  }

  /** Merge sort of <code>dest[low..high)</code>.  On entry
   * <code>src</code> and <code>dest</code> must hold the same values in
   * that range; the two arrays swap roles at each level of recursion. */
  private void mergeSort(int src[], int dest[], int low, int high) {
    int length = high - low;

    // Insertion sort on smallest arrays
    if (length < 7) {
      for (int i=low; i<high; i++)
        for (int j=i; j>low && compare(dest[j-1], dest[j])>0; j--)
          swap(dest, j, j-1);
      return;
    }

    // Recursively sort halves of dest into src
    int mid = (low + high) >>> 1;               // unsigned shift: overflow-safe
    mergeSort(dest, src, low, mid);
    mergeSort(dest, src, mid, high);

    // If list is already sorted, just copy from src to dest.  This is an
    // optimization that results in faster sorts for nearly ordered lists.
    if (compare(src[mid-1], src[mid]) <= 0) {
      System.arraycopy(src, low, dest, low, length);
      return;
    }

    // Merge sorted halves (now in src) into dest
    for(int i = low, p = low, q = mid; i < high; i++) {
      if (q>=high || p<mid && compare(src[p], src[q]) <= 0)
        dest[i] = src[p++];
      else
        dest[i] = src[q++];
    }
  }

  private void swap(int x[], int a, int b) {
    int t = x[a];
    x[a] = x[b];
    x[b] = t;
  }
}
/** Runs one {@link MergePass} and returns the number of segments it wrote.
 * The pass is closed whether or not it succeeds. */
private int mergePass(int pass, boolean last) throws IOException {
  LOG.fine("running merge pass=" + pass);
  MergePass merger = new MergePass(pass, last);
  int written;
  try {
    written = merger.run();
  } finally {
    merger.close();
  }
  return written;
}
/** Merges the segments of the intermediate file written by the previous
 * pass, at most <code>factor</code> at a time, into a new file. */
private class MergePass {
  private int pass;
  private boolean last;

  private MergeQueue queue;
  private RandomAccessFile in;      // used only to read segment sizes
  private String inName;

  /** Opens <code>outFile+"."+(pass-1)</code> for input.  Output goes to
   * <code>outFile</code> on the last pass and to
   * <code>outFile+"."+pass</code> otherwise. */
  public MergePass(int pass, boolean last) throws IOException {
    this.pass = pass;
    this.last = last;

    this.queue = new MergeQueue(factor, last ? outFile : outFile+"."+pass);

    this.inName = outFile+"."+(pass-1);
    boolean opened = false;
    try {
      this.in = new RandomAccessFile(inName, "r");
      opened = true;
    } finally {
      if (!opened) {            // release the output the queue just opened
        try {
          queue.close();
        } catch (IOException ignored) {
          // the failure to open the input is the one worth reporting
        }
      }
    }
  }

  /** Closes and deletes the input, then closes the queue.  The queue is
   * closed even when closing the input fails. */
  public void close() throws IOException {
    try {
      in.close();                               // close and delete input
      new File(inName).delete();
    } finally {
      queue.close();                            // close queue
    }
  }

  /** Merges groups of up to <code>factor</code> input segments and returns
   * the number of output segments written. */
  public int run() throws IOException {
    int segments = 0;
    long end = in.length();

    while (in.getFilePointer() < end) {
      LOG.finer("merging segment " + segments);
      long totalLength = 0;
      while (in.getFilePointer() < end && queue.size() < factor) {
        long length = in.readLong();            // size prefix of the segment
        totalLength += length;
        Reader reader = new Reader(inName, memory/(factor+1),
                                   in.getFilePointer(), length);
        boolean queued = false;
        try {
          MergeStream ms = new MergeStream(reader); // add segment to queue
          if (ms.next()) {
            queue.put(ms);
            queued = true;
          }
        } finally {
          // An empty or rejected segment never enters the queue, so
          // nothing else would ever close its reader.
          if (!queued)
            reader.close();
        }
        in.seek(reader.end);                    // skip to the next segment
      }

      if (!last)                                // intermediate file
        queue.out.writeLong(totalLength);       // write sizes

      queue.merge();                            // do a merge

      segments++;
    }

    return segments;
  }
}
/** Merge the provided files.*/
public void merge(String[] inFiles, String outFile) throws IOException {
  this.inFiles = inFiles;
  this.outFile = outFile;

  // All inputs are merged in a single pass, so the factor is temporarily
  // set to the number of inputs.  It is restored afterwards: leaving it
  // behind would silently replace the configured value for later calls to
  // sort(), which cannot make progress with a factor below two.
  int configuredFactor = this.factor;
  this.factor = inFiles.length;
  try {
    File file = new File(outFile);
    if (file.exists())
      throw new IOException("already exists: " + file);

    MergeFiles mergeFiles = new MergeFiles();
    try {                                       // make a merge pass
      mergeFiles.run();                         // run it
    } finally {
      mergeFiles.close();                       // close it
    }
  } finally {
    this.factor = configuredFactor;
  }
}
/** Merges all of <code>inFiles</code> into <code>outFile</code> in one
 * pass. */
private class MergeFiles {
  private MergeQueue queue;

  public MergeFiles() throws IOException {
    this.queue = new MergeQueue(factor, outFile);
  }

  /** Closes the queue, which closes the output and any inputs still
   * queued. */
  public void close() throws IOException {
    queue.close();
  }

  /** Opens every input file, queues those that contain at least one
   * entry and merges them. */
  public void run() throws IOException {
    LOG.finer("merging files=" + inFiles.length);
    for (int i = 0; i < inFiles.length; i++) {
      String inFile = inFiles[i];
      Reader reader = new Reader(inFile, memory/(factor+1));
      boolean queued = false;
      try {
        MergeStream ms = new MergeStream(reader);
        if (ms.next()) {
          queue.put(ms);
          queued = true;
        }
      } finally {
        // An empty or rejected file never enters the queue, so nothing
        // else would ever close its reader.
        if (!queued)
          reader.close();
      }
    }

    queue.merge();
  }
}
/** One input of a merge: a reader plus the raw bytes of the entry most
 * recently read from it. */
private class MergeStream {
  private Reader in;
  private DataOutputBuffer buffer = new DataOutputBuffer();
  private int keyLength;

  /** Wraps <code>reader</code> after checking that its key and value
   * classes are the ones this sorter was created for. */
  public MergeStream(Reader reader) throws IOException {
    boolean keyMatches = (reader.keyClass == Sorter.this.keyClass);
    if (!keyMatches) {
      throw new IOException("wrong key class: " + reader.getKeyClass() +
                            " is not " + Sorter.this.keyClass);
    }

    boolean valMatches = (reader.valClass == Sorter.this.valClass);
    if (!valMatches) {
      throw new IOException("wrong value class: "+reader.getValueClass()+
                            " is not " + Sorter.this.valClass);
    }

    this.in = reader;
  }

  /** Replaces the buffered entry with the next one from the reader.
   * Returns false once the reader is exhausted. */
  public boolean next() throws IOException {
    this.buffer.reset();
    this.keyLength = this.in.next(this.buffer);
    return !(this.keyLength < 0);
  }
}
/** Priority queue of {@link MergeStream}s ordered by their current key,
 * together with the file that merged entries are written to. */
private class MergeQueue extends PriorityQueue {
  private BufferedRandomAccessFile out;

  /** Creates a queue holding up to <code>size</code> streams and opens
   * <code>outName</code> for output. */
  public MergeQueue(int size, String outName) throws IOException {
    initialize(size);
    this.out =
      new BufferedRandomAccessFile(outName, memory/(factor+1), false);
  }

  /** Orders streams by the key of the entry each currently holds. */
  protected boolean lessThan(Object a, Object b) {
    MergeStream msa = (MergeStream)a;
    MergeStream msb = (MergeStream)b;
    return comparator.compare(msa.buffer.getData(), 0, msa.keyLength,
                              msb.buffer.getData(), 0, msb.keyLength) < 0;
  }

  /** Writes a header and then the entries of all queued streams in key
   * order.  Each stream is closed as soon as it is exhausted, so the
   * queue is empty when this returns normally. */
  public void merge() throws IOException {
    Writer writer = new Writer(out, keyClass, valClass);

    while (size() != 0) {
      MergeStream ms = (MergeStream)top();
      DataOutputBuffer buffer = ms.buffer;      // write top entry
      writer.append(buffer.getData(), 0, buffer.getLength(), ms.keyLength);

      if (ms.next()) {                          // has another entry
        adjustTop();
      } else {
        pop();                                  // done with this file
        ms.in.close();
      }
    }
  }

  /** Closes every input still queued and then the output.  All of them
   * are attempted even if one fails; the first failure is rethrown once
   * everything has been attempted. */
  public void close() throws IOException {
    IOException first = null;

    MergeStream ms;                             // close inputs
    while ((ms = (MergeStream)pop()) != null) {
      try {
        ms.in.close();
      } catch (IOException e) {
        if (first == null)
          first = e;
      }
    }

    try {
      out.close();                              // close output
    } catch (IOException e) {
      if (first == null)
        first = e;
    }

    if (first != null)
      throw first;
  }
}
}
}